library(class)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(magrittr)
library(XML)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(stringi)
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
library(ggplot2)
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(httr)
##
## Attaching package: 'httr'
## The following object is masked from 'package:plotly':
##
## config
## The following object is masked from 'package:caret':
##
## progress
library(jsonlite)
library(e1071)
library(naniar)
library(scales)
library(gghighlight)
library(stringr)
library(knitr)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:RCurl':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
##
## subplot
## The following object is masked from 'package:rvest':
##
## html
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggpubr)
library(maps)
library(usmap)
knitr::opts_chunk$set(error = TRUE)
beers=read.csv('/Users/renfengwang/Documents/SMU\ Data\ Science\ Program/Doing\ Data\ Science/Project\ 1/Beers.csv',header=T)
brewery=read.csv('/Users/renfengwang/Documents/SMU\ Data\ Science\ Program/Doing\ Data\ Science/Project\ 1/Breweries.csv',header=T)
head(beers)
## Name Beer_ID ABV IBU Brewery_id
## 1 Pub Beer 1436 0.050 NA 409
## 2 Devil's Cup 2265 0.066 NA 178
## 3 Rise of the Phoenix 2264 0.071 NA 178
## 4 Sinister 2263 0.090 NA 178
## 5 Sex and Candy 2262 0.075 NA 178
## 6 Black Exodus 2261 0.077 NA 178
## Style Ounces
## 1 American Pale Lager 12
## 2 American Pale Ale (APA) 12
## 3 American IPA 12
## 4 American Double / Imperial IPA 12
## 5 American IPA 12
## 6 Oatmeal Stout 12
head(brewery)
## Brew_ID Name City State
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
## 6 6 COAST Brewing Company Charleston SC
Load both CSV files and printed out the head of each data frame to check them visually.
#Question 1
brewery %>% arrange(State) %>% ggplot(aes(x=State),count=Name)+geom_bar()+geom_text(aes(label=..count..),stat='count',vjust=-.5) +
xlab('States')+ ylab('Brewery Numbers') + ggtitle('Numbers of Brewery by State') #Bar plot to count the brewery numbers in each state
Plot the numbers of brewery by state. Colorado (47) has the most breweries. California (39), Michigan (32), Oregon (29) and Texas (28) are the next top four states with max breweries.
#Question 2
colnames(beers)[colnames(beers)=='Brewery_id']='Brew_ID' #Change one of the data frame column names before merging two data sets.
df_beer=merge(beers, brewery, by='Brew_ID', all=T) #Outer join two data frames.
colnames(df_beer)[colnames(df_beer)=='Name.x']='Beer_Name'
colnames(df_beer)[colnames(df_beer)=='Name.y']='Brewery_Name'
head(df_beer)
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 1 1 Get Together 2692 0.045 50 American IPA
## 2 1 Maggie's Leap 2691 0.049 26 Milk / Sweet Stout
## 3 1 Wall's End 2690 0.048 19 English Brown Ale
## 4 1 Pumpion 2689 0.060 38 Pumpkin Ale
## 5 1 Stronghold 2688 0.060 25 American Porter
## 6 1 Parapet ESB 2687 0.056 47 Extra Special / Strong Bitter (ESB)
## Ounces Brewery_Name City State
## 1 16 NorthGate Brewing Minneapolis MN
## 2 16 NorthGate Brewing Minneapolis MN
## 3 16 NorthGate Brewing Minneapolis MN
## 4 16 NorthGate Brewing Minneapolis MN
## 5 16 NorthGate Brewing Minneapolis MN
## 6 16 NorthGate Brewing Minneapolis MN
tail(df_beer)
## Brew_ID Beer_Name Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery_Name City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
Merged two data frames and printed out head and tail of the new combined data frame.
#Question 3
df_beerall=df_beer %>% replace_with_na_all(condition = ~.x=='')
df_beerall=df_beerall %>% replace_with_na_all(condition = ~.x==' ') # Replace any blank values with NA
gg_miss_var(df_beerall) #Plot to check if any column has missing values
table(is.na(df_beerall$IBU))
##
## FALSE TRUE
## 1405 1005
table(is.na(df_beerall$ABV))
##
## FALSE TRUE
## 2348 62
table(is.na(df_beerall$ABV | df_beerall$IBU)) #Printed out the numbers of missing values in ABV and IBU columns
##
## FALSE TRUE
## 2348 62
imp=mice(df_beerall, method='norm.predict', m=5)
##
## iter imp variable
## 1 1 ABV IBU
## 1 2 ABV IBU
## 1 3 ABV IBU
## 1 4 ABV IBU
## 1 5 ABV IBU
## 2 1 ABV IBU
## 2 2 ABV IBU
## 2 3 ABV IBU
## 2 4 ABV IBU
## 2 5 ABV IBU
## 3 1 ABV IBU
## 3 2 ABV IBU
## 3 3 ABV IBU
## 3 4 ABV IBU
## 3 5 ABV IBU
## 4 1 ABV IBU
## 4 2 ABV IBU
## 4 3 ABV IBU
## 4 4 ABV IBU
## 4 5 ABV IBU
## 5 1 ABV IBU
## 5 2 ABV IBU
## 5 3 ABV IBU
## 5 4 ABV IBU
## 5 5 ABV IBU
## Warning: Number of logged events: 5
beer_imp=complete(imp) #Use linear regression method to replace missing values in ABV and IBU columns.
gg_miss_var(beer_imp)
table(is.na(beer_imp$IBU))
##
## FALSE
## 2410
table(is.na(beer_imp$ABV))
##
## FALSE
## 2410
table(is.na(beer_imp$Style)) #Check how many missing values in beer style column
##
## FALSE TRUE
## 2405 5
We can see there are missing values in Style, ABV and IBU columns. In ABV column, there are 62 missing values. In IBU column, there are 1005 missing values. In Style column, there are 5 missing values. I replaced the missing values in both ABV and IBU columns by linear regression method. I will replace the missing values in Beer Style column by searching their styles online as there are only 5 missing values.
#Question 4
Median_ABV_IBU=beer_imp %>% arrange(State) %>% group_by(State) %>% summarise(Median_ABV=median(ABV, na.rm=TRUE), Median_IBU=median(IBU,na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_ABV_IBU %>% ggplot(aes(x=State, y=Median_ABV,width=.5))+geom_bar(stat='identity') +
geom_text(aes(label=percent(Median_ABV, accuracy = 0.01)),vjust=-.5,size=2.5,check_overlap = T) +
xlab('States') +ylab('Median Alcoholic Content')+ggtitle('Median Alcoholic Content by State')
Median_ABV_IBU %>% ggplot(aes(x=State, y=Median_IBU, width=.5))+geom_bar(stat='identity') +
geom_text(aes(label=sprintf("%0.1f", round(Median_IBU, digits = 1))),vjust=-.5,size=2.5,check_overlap = T) +
xlab('States') +ylab('Median International Bitterness')+ggtitle('Median International Bitterness by State')
Plot median ABV and median IBU by each state. Washington DC and Kentucky have the highest median ABV values which are 6.25%. Maine has the highest median IBU value 61 followed by West Virginia 57.5.
#Question 5
Max_ABV_IBU=beer_imp %>% arrange(State) %>% group_by(State) %>% summarise(Max_ABV=max(ABV, na.rm=TRUE), Max_IBU=max(IBU,na.rm=T))
## `summarise()` ungrouping output (override with `.groups` argument)
Max_ABV_IBU %>% ggplot(aes(x=State, y=Max_ABV,width=.5))+geom_bar(stat='identity') +
geom_text(aes(label=percent(Max_ABV, accuracy = 0.01)),vjust=-.5,size=2.5,check_overlap = T) +
xlab('States') +ylab('Max Alcoholic Content')+ggtitle('Max Alcoholic Content by State')
Max_ABV_IBU %>% ggplot(aes(x=State, y=Max_IBU,width=.5))+geom_bar(stat='identity') +
geom_text(aes(label=sprintf("%0.1f", round(Max_IBU, digits = 1))),vjust=-.5,size=2.5,check_overlap = T) +
xlab('States') +ylab('Max International Bitterness')+ggtitle('Max International Bitterness by State')
The State with the maximum ABV beer is Colorado (12.8%). This is Upslope Brewing Company’s Lee Hill Series Vol.5- Belgian Style Quadrupel Ale. The State with the maximum IBU beer is Oregon (138 IBU). This is Astoria Brewing Company’s Bitter Bitch Imperial IPA.
#Question 6
beer_imp%>% summarise(Mean=mean(ABV, na.rm=TRUE),
Median=median(ABV,na.rm=T),
Min=min(ABV,na.rm=T),
Max=max(ABV,na.rm=T),
SD=sd(ABV,na.rm=T),
N=n())
## Mean Median Min Max SD N
## 1 0.05976457 0.057 0.001 0.128 0.01337555 2410
beer_imp %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV))+geom_histogram(aes(y=..density..),colour='black',fill='white')+
geom_density(alpha=.5, fill='#FF6666') #Plot the distribution of ABV
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of ABV is right skewed. ABV of beers around 5% has the most counts. There are total 2410 non-missing ABV values in this data set. The maximum ABV is 12.8%, the minimum ABV is .1%, the median ABV is 5.6%. The mean ABV is 5.98% and standard deviation of ABV is 1.35%.
#Question 7
beer_imp %>% filter(!is.na(ABV) &!is.na(IBU)) %>%
ggplot(aes(y=ABV, x=IBU))+geom_point(position='jitter')+geom_smooth(method=loess)
## `geom_smooth()` using formula 'y ~ x'
rcorr(beer_imp$ABV, beer_imp$IBU,type='pearson')
## x y
## x 1.00 0.76
## y 0.76 1.00
##
## n= 2410
##
##
## P
## x y
## x 0
## y 0
ggscatter(beer_imp,x='IBU', y='ABV',add='reg.line',conf.int=T,cor.coef = T,cor.method='pearson')
## `geom_smooth()` using formula 'y ~ x'
#Checked the linear correlation between ABV and IBU
Most beers with lower IBU (less than 50) have ABV values around 5%. When IBU value increases, ABV values spreads out. But most beers with IBU values above 50, their ABV values spread out within the region between 5% and 10%. We can find out that some dots have clearly linear regression as those are missing values which were replaced by linear regression method. Thus, the correlation coefficient is 0.76 which is probably larger than if I remove all missing values.
#Question 8 KNN
beer_imp %>% filter(is.na(beer_imp$Style)) #Print out all rows contains missing values in beer style column.
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 1 30 Special Release 2210 0.06309148 46.41795 <NA>
## 2 67 OktoberFiesta 2527 0.05300000 27.00000 <NA>
## 3 161 Kilt Lifter Scottish-Style Ale 1635 0.06000000 21.00000 <NA>
## 4 167 The CROWLER™ 1796 0.07715341 61.58128 <NA>
## 5 167 CAN'D AID Foundation 1790 0.05849041 41.26661 <NA>
## Ounces Brewery_Name City State
## 1 16 Cedar Creek Brewery Seven Points TX
## 2 12 Freetail Brewing Company San Antonio TX
## 3 12 Four Peaks Brewing Company Tempe AZ
## 4 32 Oskar Blues Brewery Longmont CO
## 5 12 Oskar Blues Brewery Longmont CO
beer_imp$Style[beer_imp$Beer_ID=='2210']='Red Ale - American Amber/Red'
beer_imp$Style[beer_imp$Beer_ID=='2527']='Lager - Märzen/Oktoberfest'
beer_imp$Style[beer_imp$Beer_ID=='1635']='Scottish Ale'
beer_imp$Style[beer_imp$Beer_ID=='1796']='IPA - AMERICAN'
beer_imp$Style[beer_imp$Beer_ID=='1790']='FRUITED ALE'
#Replaced five missing values in beer style column to real data found online.
df_beer_IPA=beer_imp %>% filter(!is.na(ABV) &!is.na(IBU)) %>%
filter(str_detect(Style, regex(str_c('\\b','IPA','\\b',sep=''), ignore_case = T))) #Find all IPA style beer
df_beer_IPA$Style=as.factor('IPA') #Replace all IPA style beer with style name 'IPA'
df_beer_Ale=beer_imp %>% filter(!is.na(ABV) &!is.na(IBU)) %>%
filter(str_detect(Style, regex(str_c('\\b','Ale','\\b',sep=''), ignore_case = T))) #Find all Ale style beer
df_beer_Ale$Style=as.factor('Ale') #Replace all Ale style beer with style name 'Ale'
df_beer_test=rbind(df_beer_IPA, df_beer_Ale) #Combine these two data frames
df_beer_test %>% ggplot(aes(x=ABV, y=IBU)) +geom_point(aes(colour=Style)) #Scatter plot for all IPA and Ale styles beer
iterations = 500
numks = 30
splitPerc = .7
masterAcc = matrix(nrow = iterations, ncol = numks)
set.seed(6)
trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1])) #Shuffle the training and testing groups
beer_train = df_beer_test[trainIndices,]
beer_test = df_beer_test[-trainIndices,]
classifications=knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style, prob = TRUE, k = 5)
CM = confusionMatrix(table(classifications,beer_test$Style))
classifications #Use KNN method to predict beer style by its ABV and IBU values
## [1] IPA IPA IPA Ale Ale IPA IPA IPA IPA IPA Ale IPA IPA IPA IPA Ale IPA IPA
## [19] Ale IPA Ale Ale IPA IPA Ale Ale Ale IPA IPA IPA IPA Ale IPA IPA Ale IPA
## [37] IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA Ale IPA
## [55] IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA IPA IPA Ale IPA Ale IPA
## [73] IPA IPA Ale IPA IPA Ale IPA Ale Ale IPA IPA IPA Ale IPA IPA IPA IPA IPA
## [91] Ale Ale IPA Ale Ale IPA Ale Ale Ale IPA IPA IPA Ale IPA Ale IPA Ale Ale
## [109] IPA IPA Ale IPA IPA Ale Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale Ale
## [127] Ale IPA IPA Ale Ale IPA IPA Ale IPA IPA IPA Ale Ale IPA Ale IPA Ale Ale
## [145] Ale Ale Ale IPA IPA Ale Ale IPA IPA IPA IPA Ale Ale Ale Ale Ale Ale Ale
## [163] Ale Ale IPA IPA IPA Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [181] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [199] Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [217] Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale IPA
## [235] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [253] Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [271] Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA IPA Ale Ale
## [289] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA IPA
## [307] IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [325] Ale Ale Ale Ale IPA IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [343] Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale
## [361] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale
## [379] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [397] IPA IPA Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale
## [415] IPA Ale Ale IPA IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA
## [433] Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale
## [451] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale
## attr(,"prob")
## [1] 0.6000000 1.0000000 0.6000000 0.6000000 0.6000000 0.5000000 1.0000000
## [8] 0.8888889 1.0000000 0.8000000 0.8235294 0.6000000 1.0000000 1.0000000
## [15] 1.0000000 0.6000000 0.6000000 1.0000000 0.8000000 0.8000000 0.6666667
## [22] 1.0000000 1.0000000 0.8888889 0.6000000 0.8000000 0.8000000 0.6000000
## [29] 0.8000000 0.8333333 0.8000000 0.6000000 0.8571429 0.8571429 0.8000000
## [36] 1.0000000 1.0000000 0.8000000 0.6000000 0.6000000 0.8000000 0.8571429
## [43] 0.9000000 0.6000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [50] 0.8888889 1.0000000 0.8000000 0.8333333 1.0000000 0.8571429 1.0000000
## [57] 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 0.5000000
## [64] 1.0000000 0.8333333 1.0000000 0.6666667 1.0000000 0.8000000 0.6000000
## [71] 0.8000000 0.8000000 0.6000000 0.6000000 1.0000000 1.0000000 0.6000000
## [78] 0.6000000 1.0000000 1.0000000 0.6666667 0.6000000 0.6000000 1.0000000
## [85] 0.6000000 0.6000000 0.8000000 0.8571429 0.8571429 0.8888889 0.8000000
## [92] 0.8000000 1.0000000 0.8000000 0.7142857 1.0000000 0.8000000 0.8000000
## [99] 0.8000000 0.6000000 1.0000000 0.8333333 0.8235294 0.6000000 1.0000000
## [106] 0.8571429 0.8333333 0.8000000 0.8000000 0.6000000 0.6000000 1.0000000
## [113] 0.6000000 0.8333333 0.8235294 0.6000000 1.0000000 0.8571429 0.8571429
## [120] 0.8571429 0.9000000 0.8888889 1.0000000 1.0000000 1.0000000 0.8000000
## [127] 0.6000000 1.0000000 0.8750000 0.6000000 0.8000000 0.8000000 0.9000000
## [134] 0.8000000 1.0000000 1.0000000 0.6000000 0.8000000 0.6000000 0.8000000
## [141] 0.8000000 0.6666667 0.8000000 0.8000000 0.8000000 0.8000000 0.6000000
## [148] 0.8333333 1.0000000 0.6000000 0.8235294 0.6000000 0.8333333 1.0000000
## [155] 1.0000000 0.6000000 1.0000000 0.6000000 0.6666667 0.6000000 0.8000000
## [162] 1.0000000 1.0000000 1.0000000 0.6000000 0.6000000 1.0000000 0.8000000
## [169] 0.6000000 1.0000000 0.8000000 0.8000000 1.0000000 0.7777778 1.0000000
## [176] 1.0000000 0.8000000 0.8000000 0.6000000 0.6000000 1.0000000 0.8333333
## [183] 0.7142857 1.0000000 1.0000000 0.6000000 0.8000000 0.6000000 1.0000000
## [190] 1.0000000 1.0000000 0.6666667 0.6000000 1.0000000 1.0000000 1.0000000
## [197] 0.6000000 0.8000000 1.0000000 0.6000000 0.6000000 1.0000000 1.0000000
## [204] 0.8000000 1.0000000 0.6000000 0.6000000 1.0000000 0.8333333 0.8000000
## [211] 1.0000000 0.6000000 1.0000000 1.0000000 0.8000000 0.6000000 0.6000000
## [218] 1.0000000 0.8000000 0.8333333 1.0000000 1.0000000 1.0000000 1.0000000
## [225] 0.6250000 0.8000000 1.0000000 1.0000000 1.0000000 0.8000000 0.6000000
## [232] 1.0000000 0.6000000 0.7500000 0.8000000 1.0000000 0.6000000 1.0000000
## [239] 1.0000000 1.0000000 0.7500000 0.6250000 0.6250000 1.0000000 0.8000000
## [246] 1.0000000 0.6000000 1.0000000 0.8235294 0.8000000 0.8000000 1.0000000
## [253] 0.6000000 0.6000000 1.0000000 0.8000000 0.8000000 1.0000000 1.0000000
## [260] 1.0000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000 1.0000000
## [267] 0.8000000 0.6000000 1.0000000 0.8000000 1.0000000 0.8000000 0.8000000
## [274] 0.8000000 0.6000000 0.8000000 0.6000000 1.0000000 0.8000000 0.6000000
## [281] 1.0000000 1.0000000 1.0000000 0.8235294 0.6000000 1.0000000 1.0000000
## [288] 1.0000000 0.6000000 0.8000000 0.8000000 1.0000000 0.8000000 0.8000000
## [295] 1.0000000 0.8000000 1.0000000 0.6000000 1.0000000 1.0000000 0.6000000
## [302] 0.8000000 0.8000000 1.0000000 0.8333333 0.8000000 0.6000000 1.0000000
## [309] 1.0000000 0.6000000 0.7777778 0.8000000 0.8000000 1.0000000 1.0000000
## [316] 1.0000000 1.0000000 0.8000000 1.0000000 0.6000000 0.6666667 1.0000000
## [323] 1.0000000 0.6000000 1.0000000 0.8000000 1.0000000 0.8000000 1.0000000
## [330] 0.6000000 1.0000000 0.8000000 1.0000000 1.0000000 0.6666667 1.0000000
## [337] 0.6000000 1.0000000 0.8000000 0.7777778 1.0000000 1.0000000 1.0000000
## [344] 0.8333333 1.0000000 0.6000000 1.0000000 1.0000000 1.0000000 0.6000000
## [351] 0.8333333 0.6250000 1.0000000 1.0000000 1.0000000 1.0000000 0.6000000
## [358] 1.0000000 0.6000000 0.6250000 1.0000000 0.8000000 1.0000000 1.0000000
## [365] 1.0000000 0.6000000 0.6666667 1.0000000 1.0000000 0.7777778 1.0000000
## [372] 1.0000000 0.6000000 1.0000000 0.7777778 0.6000000 1.0000000 1.0000000
## [379] 1.0000000 0.6000000 0.6000000 1.0000000 0.6000000 1.0000000 0.8000000
## [386] 1.0000000 0.8000000 0.6000000 1.0000000 0.6250000 1.0000000 1.0000000
## [393] 1.0000000 0.6666667 1.0000000 0.6000000 0.6000000 0.6000000 0.5555556
## [400] 1.0000000 0.8000000 0.8888889 1.0000000 1.0000000 1.0000000 1.0000000
## [407] 0.8000000 1.0000000 0.8000000 0.6250000 0.6000000 0.8000000 0.8000000
## [414] 0.6000000 0.8000000 0.6666667 1.0000000 0.8333333 0.6000000 1.0000000
## [421] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000
## [428] 1.0000000 0.8000000 0.6666667 1.0000000 0.8000000 0.6666667 1.0000000
## [435] 1.0000000 0.8000000 1.0000000 1.0000000 0.6000000 1.0000000 1.0000000
## [442] 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8235294
## [449] 0.8571429 1.0000000 0.8571429 1.0000000 1.0000000 1.0000000 0.8000000
## [456] 0.8000000 1.0000000 0.8000000 0.8000000 1.0000000 0.8000000 0.8333333
## [463] 1.0000000 1.0000000 0.8000000
## Levels: IPA Ale
CM
## Confusion Matrix and Statistics
##
##
## classifications IPA Ale
## IPA 102 37
## Ale 54 272
##
## Accuracy : 0.8043
## 95% CI : (0.7653, 0.8394)
## No Information Rate : 0.6645
## P-Value [Acc > NIR] : 1.738e-11
##
## Kappa : 0.5489
##
## Mcnemar's Test P-Value : 0.09349
##
## Sensitivity : 0.6538
## Specificity : 0.8803
## Pos Pred Value : 0.7338
## Neg Pred Value : 0.8344
## Prevalence : 0.3355
## Detection Rate : 0.2194
## Detection Prevalence : 0.2989
## Balanced Accuracy : 0.7671
##
## 'Positive' Class : IPA
##
I replaced five missing values in beer style column missing by real beer style I got online. I randomly assigned 70% of original data to training data and 30% of the original data as testing data when seed value is set to 6. We can tell when k=5, the accuracy is about 79%.
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1]))
beer_train = df_beer_test[trainIndices,]
beer_test = df_beer_test[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style, prob = TRUE, k = i)
table(classifications,beer_test$Style)
CM = confusionMatrix(table(classifications,beer_test$Style))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
p=ggplot(mapping=aes(x=seq(1,numks,1), y=MeanAcc))+geom_line()
ggplotly(p) #Shuffle the training and testing groups 500 times and loop the k value from 1 to 30
I shuffled the training and testing data 500 times by 70%/30% split. And assigned integer k value from 1 to 30. From the plot, we can tell when k=5, it gives us the highest accuracy 85% which means we can predict the beer is either India Pale Ales or any other types of Ale by knowing its ABV and IBU values with 85% accuracy when sets nearest neighbor numbers equals to 5
#Naive Bayes--------------
iterations = 500
masterAcc = matrix(nrow = iterations)
masterSen = matrix(nrow = iterations)
masterSpec = matrix(nrow = iterations)
splitPerc = .7
for(j in 1:iterations)
{
trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1]))
beer_train = df_beer_test[trainIndices,]
beer_test = df_beer_test[-trainIndices,]
model = naiveBayes(beer_train[,c(4,5)],as.factor(beer_train$Style),laplace = 1)
table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style))
CM = confusionMatrix(table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style)))
masterAcc[j] = CM$overall[1]
masterSen[j] = CM$byClass[1]
masterSpec[j] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.7954065
MeanSen
## [1] 0.6750198
MeanSpec #Use Naive Bayes model to predict the mean accuracy by shufftling training and testing groups 500 times
## [1] 0.8657228
Now I chose to use Naive Bayes model as I want to compare the accuracy with KNN model. I shuffled the training and testing data 500 times by 70%/30% split like I did in KNN model. The mean accuracy of Naive Bayes is 84% which is similar to what we got from KNN.
#Question 9 ABV and IBU average by state
Mean_ABV_IBU=beer_imp %>% group_by(State) %>% summarise(Mean_ABV=mean(ABV, na.rm=TRUE), Mean_IBU=mean(IBU,na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
colnames(Mean_ABV_IBU)[colnames(Mean_ABV_IBU)=='State']='abbr'
statedata= statepop[,c(2,3)]
statedata$abbr=as.factor(statedata$abbr)
Mean_ABV_IBU$abbr=as.factor(Mean_ABV_IBU$abbr)
statedata1=merge(statedata,Mean_ABV_IBU,by='abbr',all.x=T)
statedata2=cbind(statedata1,Mean_ABV_IBU)
statedata3=statedata2[,c(1,2,6,7)]
colnames(statedata3)[colnames(statedata3)=='full']='state' #Change the column name to state before plotting.
plot_usmap(data = statedata3, values = "Mean_ABV", color = "red") +
scale_fill_continuous(
low = "white", high = "red", name = "Mean_ABV", label = scales::comma
) + theme(legend.position = "right") + ggtitle('Mean ABV by State') #Plot average ABV by state
plot_usmap(data = statedata3, values = "Mean_IBU", color = "black") +
scale_fill_continuous(
low = "white", high = "blue", name = "Mean_IBU", label = scales::comma
) + theme(legend.position = "right") + ggtitle('Mean IBU by State') #Plot average IBU by state
If the states’ colors are close to white, it means lower mean IBU and ABV.
#Back to Question 7
df_beer_study=rbind(df_beer_sort, df_beer_Ale)
## Error in rbind(df_beer_sort, df_beer_Ale): object 'df_beer_sort' not found
df_beer_study %>% ggplot(aes(y=ABV, x=IBU))+geom_point(position='jitter')+geom_smooth() +facet_wrap(~Style)
## Error in eval(lhs, parent, parent): object 'df_beer_study' not found
Now we came back to Question 7 by checking four different styles of beer. We can see, majority of lager style beer has low IBU and its ABV values are around 5%. No ABV values of lager beer are above 7.5%. ABV values of Indian Pale Ale style seems like increases when IBU increases, but their ABV values don’t pass 10% and majority of their ABV values are between 5% to 10%. The other types of Ale beer have low IBU values which most of them are below 50. Also, most of their ABV values are between 3.75% to 10%. The data size of stout style beer is small, but we can roughly tell their ABV increases when IBU increases.